[TFT 게임 각 서버 Top10 플레이어들의 매치 데이터 분석]¶

초기 설정¶

In [6]:
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import folium
import seaborn as sns
import json
import plotly.express as px
from itertools import combinations
from collections import defaultdict
from folium.plugins import HeatMap
from matplotlib import font_manager
from folium.plugins import MarkerCluster
from folium import GeoJson
from folium import LinearColormap
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from sklearn.preprocessing import StandardScaler
from matplotlib import gridspec

# Register a Korean-capable font (Malgun Gothic) so Hangul axis labels and
# titles render in matplotlib instead of empty boxes.
# NOTE(review): hardcoded Windows font path — breaks on macOS/Linux; consider
# a fallback when the file is missing.
font_path = "C:/Windows/Fonts/malgun.ttf"
font_family = font_manager.FontProperties(fname=font_path).get_name()
plt.rcParams["font.family"] = font_family
# Keep the minus sign rendering correctly while a non-ASCII font is active.
plt.rcParams["axes.unicode_minus"] = False
# plt.rc("font", family="Malgun Gothic")

# Allow large DataFrames to display up to 1000 rows.
pd.options.display.max_rows = 1000
In [7]:
%matplotlib inline
In [8]:
%%HTML
<script src="require.js"></script>
In [9]:
# Riot platform (server) code -> routing continent used by the match API.
# Defined continent-first for readability, then inverted into the flat
# server -> continent lookup the rest of the notebook uses.
_CONTINENT_SERVERS = {
    "AMERICAS": ("BR1", "LA1", "LA2", "NA1"),
    "ASIA": ("JP1", "KR"),
    "EUROPE": ("EUN1", "EUW1", "RU", "TR1"),
    "SEA": ("OC1", "PH2", "SG2", "TH2", "TW2", "VN2"),
}
REGIONS_INFO = {
    server: continent
    for continent, servers in _CONTINENT_SERVERS.items()
    for server in servers
}

사용할 데이터 프레임 생성¶

In [10]:
# Raw extracts (snapshot 2023-08-19): player profiles, Top-10 ladder stats,
# match metadata, and per-player / per-trait / per-unit match detail tables.
player = pd.read_csv("../data/0819/player.csv")
player_stat = pd.read_csv("../data/0819/top10_player.csv")
match = pd.read_csv("../data/0819/match.csv")
match_player = pd.read_csv("../data/0819/match_player.csv")
match_trait = pd.read_csv("../data/0819/match_trait.csv")
match_unit = pd.read_csv("../data/0819/match_unit.csv")

버전 정보 확인¶

In [11]:
# Count matches per game-client version to see which patches the data covers.
match.groupby(['version_major', 'version_minor', 'version_patch']).count()
Out[11]:
match_id match_date match_length version_date tft_set_number
version_major version_minor version_patch
13 14 522 39 39 39 39 39
15 523 45 45 45 45 45
524 1332 1332 1332 1332 1332
16 525 1456 1456 1456 1456 1456
In [12]:
# Newest observed game-client version components; used later to keep only
# matches played on the latest patch.
VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH = (
    match[col].max()
    for col in ("version_major", "version_minor", "version_patch")
)
print(VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)
13 16 525

한글화 처리¶

한글화 준비¶

In [13]:
# Load the Data Dragon Korean localization file used to translate API names.
# NOTE(review): no explicit encoding — consider open(..., encoding="utf-8")
# for portability across platforms.
file_path = "../json/ko_kr.json"
with open(file_path, "r") as json_file:
    data_dragon = json.load(json_file)

# DataFrames of item, unit (champion), and trait metadata for TFT set 9.
items = pd.DataFrame(data_dragon["items"])
set9 = pd.DataFrame(data_dragon["sets"])["9"]
set9_units = pd.DataFrame(set9["champions"])
set9_traits = pd.DataFrame(set9["traits"])

# Lookup dicts: lowercase apiName -> lowercase localized (Korean) name.
item_names = dict(zip(items["apiName"].str.lower(), items["name"].str.lower()))
unit_names = dict(
    zip(set9_units["apiName"].str.lower(), set9_units["name"].str.lower())
)
trait_names = dict(
    zip(set9_traits["apiName"].str.lower(), set9_traits["name"].str.lower())
)

한글화 적용¶

In [14]:
match_trait["name"] = match_trait["name"].apply(lambda x: trait_names[x.lower()])
match_unit["name"] = match_unit["name"].apply(lambda x: unit_names[x.lower()])
match_unit["item1"] = match_unit["item1"].apply(
    lambda x: item_names[x.lower()] if isinstance(x, str) else ""
)
match_unit["item2"] = match_unit["item2"].apply(
    lambda x: item_names[x.lower()] if isinstance(x, str) else ""
)
match_unit["item3"] = match_unit["item3"].apply(
    lambda x: item_names[x.lower()] if isinstance(x, str) else ""
)

데이터 전처리, 이상치 검출¶

Tukey outlier detection¶

In [15]:
def calculate_fence(data, coef=1.5):
    """Return Tukey's (lower, upper) outlier fences for *data*.

    Values outside [q1 - coef*iqr, q3 + coef*iqr] are considered outliers;
    coef=1.5 is the conventional Tukey multiplier.
    """
    q1, q3 = np.percentile(data, [25, 75])
    spread = coef * (q3 - q1)
    return q1 - spread, q3 + spread

게임 길이를 기반으로 이상치 검출¶

In [16]:
# Spot matches that were not played out properly (early surrender / AFK):
# such players exit on a very low round or a short elimination time.
fig, (ax_round, ax_time) = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle("게임 길이 기준 이상치 검출")

sns.boxplot(match_player, y="last_round", color="skyblue", ax=ax_round)
sns.boxplot(match_player, y="time_eliminated", color="skyblue", ax=ax_time)

fig.tight_layout()
plt.show()
  • 항복이나 자리비움으로 인해 플레이어가 빨리 탈락하여 플레이 시간이 짧은 경우가 있습니다.
  • 데이터를 확인하여 위험성이 있는 조합(필트오버)의 경우를 제외하고는 데이터를 제거합니다.
In [17]:
# Tukey fences on how long each player survived.
last_round_lfence, last_round_ufence = calculate_fence(match_player["last_round"])
time_eliminated_lfence, time_eliminated_ufence = calculate_fence(
    match_player["time_eliminated"]
)

# Players whose survival length falls outside either fence.
outlier_condition = (
    (match_player["last_round"] > last_round_ufence)
    | (match_player["last_round"] < last_round_lfence)
    | (match_player["time_eliminated"] > time_eliminated_ufence)
    | (match_player["time_eliminated"] < time_eliminated_lfence)
)
outlier_match_player = match_player[outlier_condition]

# Left join: a player may have fielded no units at all and must still be kept.
outlier_match_player = outlier_match_player.merge(
    match_unit, how="left", left_on="match_player_id", right_on="match_player_id"
)

outlier_match_player = outlier_match_player.loc[
    :,
    [
        "match_player_id",
        "last_round",
        "level",
        "placement",
        "time_eliminated",
        "name",
        "tier",
    ],
]
# Collapse back to one row per player; collect fielded unit names into a list.
outlier_match_player = outlier_match_player.groupby(by="match_player_id").agg(
    {
        "last_round": "first",
        "level": "first",
        "placement": "first",
        "time_eliminated": "first",
        "name": list,
        "tier": "mean",
    }
)

# Exempt players who fielded "t-헥스" (the high-risk Piltover comp): its
# win-or-lose-fast style legitimately produces extreme game lengths.
# (Simplified the redundant `True if ... else False` lambda.)
outlier_player = outlier_match_player[
    outlier_match_player["name"].apply(lambda champ_list: "t-헥스" not in champ_list)
]
# Drop the outlier players' rows; rows of the other players in the same
# match are intentionally kept.
preprocessed_match_player = match_player[
    ~match_player["match_player_id"].isin(outlier_player.index)
]
preprocessed_match_unit = match_unit[
    ~match_unit["match_player_id"].isin(outlier_player.index)
]
preprocessed_match_trait = match_trait[
    ~match_trait["match_player_id"].isin(outlier_player.index)
]
In [18]:
# Re-draw the survival-length box plots after outlier removal to confirm the
# distributions tightened up.
fig, (ax_round, ax_time) = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle("게임 길이 기준 이상치 검출")

sns.boxplot(preprocessed_match_player, y="last_round", color="skyblue", ax=ax_round)
sns.boxplot(preprocessed_match_player, y="time_eliminated", color="skyblue", ax=ax_time)

fig.tight_layout()
plt.show()

유닛 비용의 합으로 이상치 검출¶

In [19]:
# Heimerdinger's turret has no gold cost — exclude it.
preprocessed_match_unit = preprocessed_match_unit[
    preprocessed_match_unit["name"] != "최첨단 포탑"
]

# Exclude special units (rarity codes above 6).
# FIX: .copy() detaches the result from the pre-filter frame so the column
# assignments below do not raise SettingWithCopyWarning (and cannot be
# silently dropped by pandas' copy-on-write behavior).
preprocessed_match_unit = preprocessed_match_unit[
    preprocessed_match_unit["rarity"] <= 6
].copy()

# API rarity code -> shop cost in gold.
rarity_to_cost = {0: 1, 1: 2, 2: 3, 4: 4, 6: 5}
# Star levels above 3 do not occur in normal play; clamp to 3.
preprocessed_match_unit["tier"] = preprocessed_match_unit["tier"].apply(
    lambda x: 3 if x >= 4 else x
)
# Gold value of a single copy of the unit.
preprocessed_match_unit["single_cost"] = preprocessed_match_unit["rarity"].apply(
    lambda x: rarity_to_cost[x]
)
# Total gold invested: a k-star unit is built from 3**(k-1) copies.
preprocessed_match_unit["cost"] = preprocessed_match_unit["single_cost"] * (
    3 ** (preprocessed_match_unit["tier"] - 1)
)

# Aggregate per player: unit list, total gold value, average star level.
revised_match_unit = preprocessed_match_unit.groupby(by="match_player_id").agg(
    {"name": list, "cost": "sum", "tier": "mean"}
)
revised_match_unit["unit_count"] = revised_match_unit["name"].apply(len)
revised_match_unit = revised_match_unit.merge(
    preprocessed_match_player.loc[
        :, ["match_player_id", "placement", "last_round", "level", "time_eliminated"]
    ],
    how="inner",
    left_on="match_player_id",
    right_on="match_player_id",
)
revised_match_unit.rename(
    columns={"cost": "cost_sum", "tier": "avg_tier"}, inplace=True
)
# revised_match_unit[['cost_sum', 'avg_tier', 'unit_count', 'placement']].head(24)
In [20]:
# Interactive box plot of per-player total unit cost, for eyeballing outliers.
cost_box = px.box(
    revised_match_unit,
    y="cost_sum",
    title="유닛 비용의 합 기준 이상치 검출",
    color_discrete_sequence=["#58F"],
    template="plotly",
)
cost_box
In [21]:
# Players whose board was worth 18 gold or less effectively did not play the
# game out; flag them for removal.
outlier_player = revised_match_unit.loc[
    revised_match_unit["cost_sum"] <= 18, "match_player_id"
]


def _without_flagged(frame):
    # Drop flagged players' rows while keeping their opponents' rows.
    return frame[~frame["match_player_id"].isin(outlier_player)]


print(
    "제거 전:",
    preprocessed_match_player.shape,
    preprocessed_match_unit.shape,
    preprocessed_match_trait.shape,
    revised_match_unit.shape,
)
preprocessed_match_player = _without_flagged(preprocessed_match_player)
preprocessed_match_unit = _without_flagged(preprocessed_match_unit)
preprocessed_match_trait = _without_flagged(preprocessed_match_trait)
revised_match_unit = _without_flagged(revised_match_unit)
print(
    "제거 후:",
    preprocessed_match_player.shape,
    preprocessed_match_unit.shape,
    preprocessed_match_trait.shape,
    revised_match_unit.shape,
)
제거 전: (22876, 7) (184304, 10) (245399, 7) (22864, 9)
제거 후: (22840, 7) (184177, 10) (245202, 7) (22828, 9)
In [22]:
# Same cost-sum box plot after removal, to confirm the tail is gone.
cost_box_after = px.box(
    revised_match_unit,
    y="cost_sum",
    title="유닛 비용의 합 기준 이상치 검출",
    color_discrete_sequence=["#58F"],
    template="plotly",
)
cost_box_after

게임 도중 3성 유닛을 뽑기 위해 다른 유닛들을 판 데이터 제거¶

In [23]:
# Selling most of the board to force a 3-star unit leaves fewer than 5 units
# fielded; such boards distort the cost/placement analysis.
outlier_player = revised_match_unit.loc[
    revised_match_unit["unit_count"] < 5, "match_player_id"
]


def _drop_small_boards(frame):
    # Remove flagged players' rows; opponents in the same match are kept.
    return frame[~frame["match_player_id"].isin(outlier_player)]


print(
    "제거 전:",
    preprocessed_match_player.shape,
    preprocessed_match_unit.shape,
    preprocessed_match_trait.shape,
    revised_match_unit.shape,
)
preprocessed_match_player = _drop_small_boards(preprocessed_match_player)
preprocessed_match_unit = _drop_small_boards(preprocessed_match_unit)
preprocessed_match_trait = _drop_small_boards(preprocessed_match_trait)
revised_match_unit = _drop_small_boards(revised_match_unit)
print(
    "제거 후:",
    preprocessed_match_player.shape,
    preprocessed_match_unit.shape,
    preprocessed_match_trait.shape,
    revised_match_unit.shape,
)
제거 전: (22840, 7) (184177, 10) (245202, 7) (22828, 9)
제거 후: (22799, 7) (184035, 10) (245046, 7) (22787, 9)

각 서버 사용자 분포¶

서버에 해당하는 국가 매칭¶

In [24]:
# Representative countries per server, used to paint the choropleth map.
# Country names must match the GeoJSON `name` property exactly.
# FIX: "Columbia" -> "Colombia" — the misspelling never matched the Natural
# Earth feature name, so Colombia was silently left unpainted on the map.
# NOTE(review): verify "Korea" matches the GeoJSON name (some Natural Earth
# builds use "South Korea").
SERVER_INFO = {
    "BR1": ("Brazil",),
    "EUN1": ("Sweden", "Norway", "Estonia", "Latvia"),
    "EUW1": ("Spain", "United Kingdom", "Belgium"),
    "JP1": ("Japan",),
    "KR": ("Korea",),
    "LA1": ("Mexico", "Colombia", "Peru"),
    "LA2": ("Bolivia", "Uruguay", "Chile"),
    "NA1": ("United States", "Canada"),
    "OC1": ("Australia", "New Zealand"),
    "PH2": ("Philippines",),
    "RU": ("Russia",),
    "SG2": ("Singapore", "Malaysia", "Indonesia"),
    "TH2": ("Thailand",),
    "TR1": ("Turkey",),
    "TW2": ("Taiwan", "Hong Kong", "Macao"),
    "VN2": ("Vietnam",),
}

지도 시각화할 데이터프레임을 생성¶

In [25]:
# Derive per-match region / continent / date fields from the raw match table.
revised_match = match.copy()
# Match ids look like "<REGION>_<number>"; the prefix is the server code.
revised_match["region"] = revised_match["match_id"].str.split("_").str[0]
revised_match["continent"] = revised_match["region"].apply(lambda r: REGIONS_INFO[r])
# Keep only the calendar-date part of "YYYY-MM-DD HH:MM:SS".
revised_match["date"] = pd.to_datetime(
    revised_match["match_date"].str.split(" ").str[0]
)
# POSIX timestamp of the match date, used for numeric plotting below.
revised_match["timestamp"] = revised_match["date"].apply(lambda d: d.timestamp())
In [26]:
# Unique-player counts per server (puuid de-duplicated across matches).
player_count = revised_match.merge(match_player, how="inner", on="match_id")
player_count = player_count.drop_duplicates("puuid")
player_count = player_count.loc[:, ["continent", "region", "puuid"]]
player_count = player_count.groupby(["continent", "region"]).count()
player_count = player_count.reset_index().sort_values("puuid", ascending=True)
player_count.rename(columns={"puuid": "count"}, inplace=True)

# Expand each server row into one row per representative country, so the
# choropleth can color every country belonging to a server.
new_data = [
    (region, n_players, country)
    for region, n_players in player_count.loc[:, ["region", "count"]].to_numpy()
    for country in SERVER_INFO.get(region, ())
]

country_player_count = pd.DataFrame(new_data, columns=["region", "count", "country"])

서버별 사용자 수 시각화¶

In [27]:
# Bar chart of unique players per server, colored by routing continent.
server_bar = px.bar(
    player_count,
    x="region",
    y="count",
    text_auto=".3s",
    color="continent",
    title="서버별 사용자 수",
    # Skip the first (near-black) Turbo color for readability.
    color_discrete_sequence=px.colors.sequential.Turbo[1:],
)
server_bar.update_traces(
    textfont_size=12, textangle=0, textposition="outside", cliponaxis=False
)
server_bar.show()

서버별 사용자 분포 지도 시각화¶

In [28]:
# World country borders (Natural Earth 1:50m admin-0) as GeoJSON.
# NOTE(review): fetched over plain http at run time — consider https and/or
# caching the file locally for reproducibility.
url = "http://geojson.xyz/naturalearth-3.3.0/ne_50m_admin_0_countries.geojson"

# Read the GeoJSON into a GeoDataFrame.
countries_geo = gpd.read_file(url)
In [29]:
# Base world map.
# FIX: renamed the variable from `map`, which shadowed the Python builtin.
world_map = folium.Map(location=(30, 10), zoom_start=2, tiles="cartodb positron")

# 16 equal-width legend bins spanning the observed player counts.
bins = np.linspace(
    country_player_count["count"].min(), country_player_count["count"].max(), 16
)

# Choropleth layer: shade each country by its server's unique-player count.
choropleth_layer = folium.Choropleth(
    geo_data=countries_geo,
    data=country_player_count,
    columns=["country", "count"],
    key_on="feature.properties.name",
    fill_color="Blues",
    fill_opacity=0.7,
    line_opacity=0.3,
    nan_fill_color="white",
    legend_name="Player count",
    bins=bins,
)
choropleth_layer.add_to(world_map)

# Country-indexed frame for fast tooltip lookups.
player_data_indexed = country_player_count.set_index("country")

# Inject tooltip fields (server code and player count) into each GeoJSON
# feature; countries without data get empty strings.
for s in choropleth_layer.geojson.data["features"]:
    region_name = s["properties"]["name"]
    if region_name not in player_data_indexed.index:
        count_value = ""
        server_value = ""
    else:
        count_value = str(
            player_data_indexed.loc[region_name, "count"]
        )  # Convert to str
        server_value = str(
            player_data_indexed.loc[region_name, "region"]
        )  # Convert to str
    s["properties"]["count"] = count_value
    s["properties"]["server"] = server_value

tooltip = folium.GeoJsonTooltip(fields=["name", "server", "count"], labels=True)
choropleth_layer.geojson.add_child(tooltip)

folium.TileLayer("openstreetmap").add_to(world_map)
folium.LayerControl().add_to(world_map)
world_map
Out[29]:
Make this Notebook Trusted to load map: File -> Trust Notebook

지역별 Top10 플레이어의 게임 시간 분포¶

In [30]:
# One box plot of match timestamps per server: shows when each server's
# sampled matches were played.
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(12, 10))
sns.set_theme(style="whitegrid", font=font_family)

# Shared y-limits (slightly padded) so the panels are directly comparable.
ts_floor = revised_match["timestamp"].min() * 0.9999
ts_ceil = revised_match["timestamp"].max() * 1.0001
for region, ax in zip(revised_match["region"].unique(), axes.flatten()):
    region_rows = revised_match[revised_match["region"] == region]
    sns.boxplot(data=region_rows, y="timestamp", color="skyblue", ax=ax)
    ax.set_ylim(ts_floor, ts_ceil)
    ax.set_title(f"Region {region}")

plt.tight_layout()
plt.show()
In [31]:
# Matches played by the Top-10 ladder players, enriched with player profile
# and match metadata (merge order preserved so _x/_y suffixes are unchanged).
top10_player = player_stat.loc[:, ["puuid"]]
top10_player_match = (
    top10_player
    .merge(match_player, how="inner", on="puuid")
    .merge(player, how="inner", on="puuid")
    .merge(revised_match, how="inner", on="match_id")
)
In [32]:
# Per-player distribution of match timestamps for the NA1 server's Top-10.
top10_player_match_reg = top10_player_match[top10_player_match["region_x"] == "NA1"]

fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(10, 5))
sns.set_theme(style="whitegrid", font=font_family)

# Shared y-limits across all 10 panels, slightly padded.
y_low = top10_player_match["timestamp"].min() * 0.9999
y_high = top10_player_match["timestamp"].max() * 1.0001
for name, ax in zip(top10_player_match_reg["name"].unique(), axes.flatten()):
    player_rows = top10_player_match_reg[top10_player_match_reg["name"] == name]
    sns.boxplot(data=player_rows, y="timestamp", color="skyblue", ax=ax)
    ax.set_ylim(y_low, y_high)
    ax.set_title(f"{name}", fontsize=9)
    # Hide the raw timestamp tick labels — only the shape matters here.
    ax.set_yticklabels("")

plt.tight_layout()
plt.show()

연관 규칙 분석¶

특성 별 연관규칙 분석¶

In [33]:
# Keep only meaningful trait rows: total tier above 1 and the trait actually
# active (current tier > 0); then collect each player's traits into a list.
active_traits = (preprocessed_match_trait["tier_total"] > 1) & (
    preprocessed_match_trait["tier_current"] > 0
)
revised_match_trait = preprocessed_match_trait[active_traits]
revised_match_trait = revised_match_trait.groupby("match_player_id")[["name"]].agg(list)
In [34]:
# Merge match-level version info onto the per-player trait lists.
merged_match_trait = match.merge(
    preprocessed_match_player, how="inner", left_on="match_id", right_on="match_id"
)
merged_match_trait = merged_match_trait.merge(
    revised_match_trait,
    how="inner",
    left_on="match_player_id",
    right_on="match_player_id",
)

# Keep only records from the latest patch.
latest_match_trait = merged_match_trait[
    (merged_match_trait["version_major"] == VERSION_MAJOR)
    & (merged_match_trait["version_minor"] == VERSION_MINOR)
    & (merged_match_trait["version_patch"] == VERSION_PATCH)
]
# BUG FIX: this line previously selected from `merged_match_trait`, silently
# discarding the version filter above; it must select from the filtered frame.
latest_match_trait = latest_match_trait.loc[:, ["match_player_id", "name"]]

# One-hot encode each player's trait list and mine frequent itemsets.
te = TransactionEncoder()
te_result = te.fit_transform(latest_match_trait["name"])
td_df = pd.DataFrame(te_result, columns=te.columns_)
freq_items = apriori(td_df, min_support=0.05, use_colnames=True)

# Association rules with confidence >= 0.8; string columns for plot tooltips.
rules = association_rules(freq_items, metric="confidence", min_threshold=0.8)
rules["antecedents_str"] = rules["antecedents"].apply(lambda x: ",".join(list(x)))
rules["consequents_str"] = rules["consequents"].apply(lambda x: ",".join(list(x)))
In [35]:
# Scatter plot of mined rules: support vs confidence, sized/colored by lift.
rules_scatter = px.scatter(
    rules,
    x="support",
    y="confidence",
    size="lift",
    color="lift",
    custom_data=["antecedents_str", "consequents_str", "lift"],
    color_continuous_scale=px.colors.sequential.Jet,
    title="특성 조합 연관규칙",
    labels={"support": "지지도", "confidence": "신뢰도", "lift": "향상도"},
)
# Rich hover tooltip: antecedent, consequent, and the three metrics.
rules_scatter.update_traces(
    hovertemplate="<br>".join(
        [
            "<b>조건</b>: %{customdata[0]}",
            "<b>결과</b>: %{customdata[1]}<br>",
            "<b>지지도</b>: %{x:.3%}",
            "<b>신뢰도</b>: %{y:.3%}",
            "<b>향상도</b>: %{customdata[2]:.4f}",
        ]
    )
)
# Output size and hover behavior.
rules_scatter.update_layout(
    width=1100,  # width in px
    height=600,  # height in px
    hovermode='closest'  # show the tooltip of the nearest data point
)
rules_scatter.show()

각 특성의 신뢰도 히트맵 표현¶

In [36]:
# Recompute rules with no confidence floor so every 1-to-1 pair appears.
rules_conf0 = association_rules(freq_items, metric="confidence", min_threshold=0)
rules_conf0["antecedents_str"] = rules_conf0["antecedents"].apply(
    lambda itemset: ",".join(list(itemset))
)
rules_conf0["consequents_str"] = rules_conf0["consequents"].apply(
    lambda itemset: ",".join(list(itemset))
)

# Keep only rules with exactly one trait on each side.
single_condition = (rules_conf0["antecedents"].apply(len) == 1) & (
    rules_conf0["consequents"].apply(len) == 1
)
rules_single = rules_conf0[single_condition].loc[
    :, ["antecedents_str", "consequents_str", "confidence"]
]

# Traits that will appear on both axes of the heatmap.
target_trait = sorted(rules_single["antecedents_str"].unique())

# Empty frame, filled column-by-column with confidences in the next cell.
revised_rules_df = pd.DataFrame(index=target_trait)
In [37]:
# 대상 특성 순회
for col in target_trait:
    # 열 설정을 위한 빈 리스트 생성 : row -> col의 신뢰도를 저장합니다.
    col_conf = []
    for row in target_trait:
        confidence = rules_single[
            (rules_single["antecedents_str"] == row)
            & (rules_single["consequents_str"] == col)
        ]["confidence"]

        # confidence값이 없을 경우는 0으로 설정
        if confidence.shape[0] > 0:
            col_conf.append(confidence.values[0])
        else:
            col_conf.append(0)
    revised_rules_df[col] = col_conf
In [38]:
# Heatmap of pairwise trait confidences (row trait -> column trait).
fig, ax = plt.subplots(figsize=(15, 10))
fig.suptitle("각 특성의 신뢰도", x=0.45)
sns.heatmap(
    revised_rules_df,
    annot=True,
    ax=ax,
    cmap=sns.color_palette("Blues", as_cmap=True),
    linewidths=0.01,
)

plt.tight_layout()
plt.show()